{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "graffitiCellId": "id_nbx3c48"
   },
   "outputs": [],
   "source": [
    "from sklearn import datasets\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "graffitiCellId": "id_xr02q5o",
    "graffitiConfig": {
     "rows": 6,
     "startingDirectory": "irisAnalysis",
     "terminalId": "id_xr02q5o",
     "type": "terminal"
    }
   },
   "source": [
    "<i>Loading terminal (id_xr02q5o), please wait...</i>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "graffitiCellId": "id_boc9k9n"
   },
   "outputs": [],
   "source": [
    "from pandas_profiling import ProfileReport\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import tree\n",
    "from sklearn.metrics import classification_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "graffitiCellId": "id_kd32mrl"
   },
   "source": [
    "## 数据载入"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "graffitiCellId": "id_t1wgcjt"
   },
   "outputs": [],
   "source": [
    "iris = datasets.load_iris()\n",
    "X = iris.data\n",
    "y = iris.target"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "graffitiCellId": "id_xv6z44t"
   },
   "source": [
    "## 探索性数据分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "graffitiCellId": "id_i6hzqj6"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sepal length (cm)</th>\n",
       "      <th>sepal width (cm)</th>\n",
       "      <th>petal length (cm)</th>\n",
       "      <th>petal width (cm)</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.1</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.7</td>\n",
       "      <td>3.2</td>\n",
       "      <td>1.3</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.6</td>\n",
       "      <td>3.1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \\\n",
       "0                5.1               3.5                1.4               0.2   \n",
       "1                4.9               3.0                1.4               0.2   \n",
       "2                4.7               3.2                1.3               0.2   \n",
       "3                4.6               3.1                1.5               0.2   \n",
       "4                5.0               3.6                1.4               0.2   \n",
       "\n",
       "   label  \n",
       "0      0  \n",
       "1      0  \n",
       "2      0  \n",
       "3      0  \n",
       "4      0  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(data=iris.data, columns=iris.feature_names)\n",
    "df['label'] = y\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "graffitiCellId": "id_n3hbuis",
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "683cb181b71f48a28e1b1627fea8ddae",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='variables', max=5.0, style=ProgressStyle(description_widt…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "009a0fbf18ea457994c28bb492eb6e21",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='correlations', max=6.0, style=ProgressStyle(description_w…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8a58ffb1cc4f4d08b9025ba60545f72f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='interactions [continuous]', max=16.0, style=ProgressStyle…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "98fb58f7f327494bbc229f56ca4735ac",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='table', max=1.0, style=ProgressStyle(description_width='i…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7182fed3c5b343e1b34f717065647ad5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='missing', max=2.0, style=ProgressStyle(description_width=…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3945de5a74c24b58878e5324cef58995",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='warnings', max=3.0, style=ProgressStyle(description_width…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a638ad740f274475a2012ce9d880104d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='package', max=1.0, style=ProgressStyle(description_width=…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4700fe57234b449c8daddf844eda06e4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='build report structure', max=1.0, style=ProgressStyle(des…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "profile = ProfileReport(df, title='Pandas Profiling Report', html={'style':{'full_width':True}})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "graffitiCellId": "id_6mk3nta"
   },
   "source": [
    "在下面的探索性数据分析中，我们发现，首先原始数据已经处理的比较好了，没有缺失值，仅有一行重复数据。其次出于这里的问题是多分类，我们需要注意各类的样本比，经检查这里也是没有类别不均衡问题的，三类的数据各50个，十分平均。同时，这里部分变量之间是存在较强的相关关系的，但是由于特征个数过少，所以我们也不进行去除。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "如果下面的内容无法显示，分析报告见文件`irisReport.html`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "graffitiCellId": "id_terp93l"
   },
   "outputs": [],
   "source": [
    "profile.to_file(output_file=\"irisReport.html\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "graffitiCellId": "id_dd2gl3w"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "932b22915c4f44d99096b37267fdd717",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(value='Number of va…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Report generated with <a href=\"https://github.com/pandas-profiling/pandas-profiling\">pandas-profiling</a>."
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "profile.to_widgets()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "graffitiCellId": "id_j1d8mu7"
   },
   "source": [
    "## 特征工程\n",
    "\n",
    "因为这里的特征比较少，所以无需进行变量筛选。对于大多数模型来说可以直接将原始变量导入，对于部分特殊的模型可能需要进行标准化或中心化等操作。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "graffitiCellId": "id_76uhqsx"
   },
   "source": [
    "## 模型建立"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "graffitiCellId": "id_qel74x1"
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.33, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "graffitiCellId": "id_51fq3nj"
   },
   "outputs": [],
   "source": [
    "clf = tree.DecisionTreeClassifier()\n",
    "clf = clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "graffitiCellId": "id_gicsft3"
   },
   "outputs": [],
   "source": [
    "y_pred = clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "graffitiCellId": "id_0s5ahzw"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "      setosa       1.00      1.00      1.00        19\n",
      "  versicolor       1.00      1.00      1.00        15\n",
      "   virginica       1.00      1.00      1.00        16\n",
      "\n",
      "    accuracy                           1.00        50\n",
      "   macro avg       1.00      1.00      1.00        50\n",
      "weighted avg       1.00      1.00      1.00        50\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(y_test, y_pred, target_names=iris.target_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "graffitiCellId": "id_bzxj1p1"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "graffiti": {
   "firstAuthorId": "dev",
   "id": "id_7cibpgd",
   "language": "EN"
  },
  "kernelspec": {
   "display_name": "ML",
   "language": "python",
   "name": "ml"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
